{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import os, argparse\n",
    "import numpy as np\n",
    "import cv2 as cv2\n",
    "import spacy as spacy\n",
    "import matplotlib.pyplot as plt\n",
    "from keras.models import Model, Input\n",
    "from keras.layers.core import Dense, Dropout, Reshape\n",
    "from keras.layers.recurrent import LSTM\n",
    "from keras.layers.merge import concatenate\n",
    "from keras.applications.vgg16 import VGG16\n",
    "from keras.preprocessing import image\n",
    "from keras.applications.vgg16 import preprocess_input\n",
    "import numpy as np\n",
    "from sklearn.externals import joblib\n",
    "import PIL.Image\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Directory holding the pre-trained artifacts. It defaults to the location\n",
    "# used when the notebook was written and can be redirected by setting the\n",
    "# VQA_DATA_DIR environment variable before starting the kernel.\n",
    "vqa_data_dir = os.environ.get('VQA_DATA_DIR',\n",
    "                              '/Users/gulli/Books/TF/code/git/tensorflowBook/Chapter5')\n",
    "# mapping id -> labels for categories\n",
    "label_encoder_file_name = os.path.join(vqa_data_dir, 'FULL_labelencoder_trainval.pkl')\n",
    "# max length across corpus\n",
    "max_length_questions = 30\n",
    "# VGG output\n",
    "length_vgg_features = 4096\n",
    "# Embedding output\n",
    "length_feature_space = 300\n",
    "# pre-trained weights\n",
    "VQA_weights_file = os.path.join(vqa_data_dir, 'VQA_MODEL_WEIGHTS.hdf5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# image features\n",
    "def get_image_features(img_path, VGG16modelFull):\n",
    "    '''Return the fc2 activations of VGG16 for one image.\n",
    "\n",
    "    img_path       -- path of the image file to load\n",
    "    VGG16modelFull -- Keras VGG16 model that contains the layer named fc2\n",
    "\n",
    "    The result is an array reshaped to (1, length_vgg_features).\n",
    "    '''\n",
    "    # VGG16 was trained on 224x224 images, so every new image goes\n",
    "    # through the same resizing\n",
    "    img = image.load_img(img_path, target_size=(224, 224))\n",
    "    x = image.img_to_array(img)\n",
    "    # the network works on batches: add a leading axis of size 1 even\n",
    "    # though there is a single image\n",
    "    x = np.expand_dims(x, axis=0)\n",
    "    x = preprocess_input(x)\n",
    "    # Cut the network at fc2 and run only that sub-model. The complete\n",
    "    # forward pass of VGG16modelFull is not executed: its output was\n",
    "    # never used and only doubled the inference time.\n",
    "    model_extractfeatures = Model(inputs=VGG16modelFull.input,\n",
    "                                  outputs=VGG16modelFull.get_layer('fc2').output)\n",
    "    fc2_features = model_extractfeatures.predict(x)\n",
    "    return fc2_features.reshape((1, length_vgg_features))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# embedding\n",
    "def get_question_features(question):\n",
    "    '''Turn a question into a time series of word vectors.\n",
    "\n",
    "    question -- unicode string; it is tokenized by spaCy and every token is\n",
    "                replaced by its vector\n",
    "\n",
    "    Returns an array of shape (1, max_length_questions, length_feature_space).\n",
    "    Tokens beyond max_length_questions are dropped, and the slots left over by\n",
    "    a shorter question stay at zero.\n",
    "    '''\n",
    "    # NOTE: the spaCy pipeline is loaded on every call\n",
    "    word_embeddings = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')\n",
    "    tokens = word_embeddings(question)\n",
    "    # Clamp to the fixed number of time steps so that a question longer than\n",
    "    # max_length_questions is truncated instead of raising IndexError.\n",
    "    ntokens = min(len(tokens), max_length_questions)\n",
    "    question_tensor = np.zeros((1, max_length_questions, length_feature_space))\n",
    "    for j in range(ntokens):\n",
    "        question_tensor[0, j, :] = tokens[j].vector\n",
    "    return question_tensor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "input_5 (InputLayer)         (None, 224, 224, 3)       0         \n",
      "_________________________________________________________________\n",
      "block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      \n",
      "_________________________________________________________________\n",
      "block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     \n",
      "_________________________________________________________________\n",
      "block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         \n",
      "_________________________________________________________________\n",
      "block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     \n",
      "_________________________________________________________________\n",
      "block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    \n",
      "_________________________________________________________________\n",
      "block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         \n",
      "_________________________________________________________________\n",
      "block3_conv1 (Conv2D)        (None, 56, 56, 256)       295168    \n",
      "_________________________________________________________________\n",
      "block3_conv2 (Conv2D)        (None, 56, 56, 256)       590080    \n",
      "_________________________________________________________________\n",
      "block3_conv3 (Conv2D)        (None, 56, 56, 256)       590080    \n",
      "_________________________________________________________________\n",
      "block3_pool (MaxPooling2D)   (None, 28, 28, 256)       0         \n",
      "_________________________________________________________________\n",
      "block4_conv1 (Conv2D)        (None, 28, 28, 512)       1180160   \n",
      "_________________________________________________________________\n",
      "block4_conv2 (Conv2D)        (None, 28, 28, 512)       2359808   \n",
      "_________________________________________________________________\n",
      "block4_conv3 (Conv2D)        (None, 28, 28, 512)       2359808   \n",
      "_________________________________________________________________\n",
      "block4_pool (MaxPooling2D)   (None, 14, 14, 512)       0         \n",
      "_________________________________________________________________\n",
      "block5_conv1 (Conv2D)        (None, 14, 14, 512)       2359808   \n",
      "_________________________________________________________________\n",
      "block5_conv2 (Conv2D)        (None, 14, 14, 512)       2359808   \n",
      "_________________________________________________________________\n",
      "block5_conv3 (Conv2D)        (None, 14, 14, 512)       2359808   \n",
      "_________________________________________________________________\n",
      "block5_pool (MaxPooling2D)   (None, 7, 7, 512)         0         \n",
      "_________________________________________________________________\n",
      "flatten (Flatten)            (None, 25088)             0         \n",
      "_________________________________________________________________\n",
      "fc1 (Dense)                  (None, 4096)              102764544 \n",
      "_________________________________________________________________\n",
      "fc2 (Dense)                  (None, 4096)              16781312  \n",
      "_________________________________________________________________\n",
      "predictions (Dense)          (None, 1000)              4097000   \n",
      "=================================================================\n",
      "Total params: 138,357,544\n",
      "Trainable params: 138,357,544\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n",
      "(1, 4096)\n"
     ]
    }
   ],
   "source": [
    "# Open the picture the question is about and hand it to the image viewer\n",
    "image_file_name = 'girl.jpg'\n",
    "img0 = PIL.Image.open(image_file_name)\n",
    "img0.show()\n",
    "\n",
    "# VGG16 with ImageNet weights; the top layers are included because\n",
    "# get_image_features reads the fc2 layer\n",
    "model = VGG16(include_top=True, weights='imagenet')\n",
    "model.summary()\n",
    "\n",
    "# get the salient features\n",
    "image_features = get_image_features(img_path=image_file_name,\n",
    "                                    VGG16modelFull=model)\n",
    "print(image_features.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1, 30, 300)\n"
     ]
    }
   ],
   "source": [
    "# Encode the question as a sequence of word vectors\n",
    "question = u'Who is in this picture?'\n",
    "language_features = get_question_features(question=question)\n",
    "print(language_features.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# combine\n",
    "def build_combined_model(\n",
    "    number_of_LSTM              = 3,\n",
    "    number_of_hidden_units_LSTM = 512,\n",
    "    number_of_dense_layers      = 3,\n",
    "    number_of_hidden_units      = 1024,\n",
    "    activation_function         = 'tanh',\n",
    "    dropout_pct                 = 0.5\n",
    "):\n",
    "    '''Build the network that merges image features and question features.\n",
    "\n",
    "    The image input is a vector of length_vgg_features values. The language\n",
    "    input is a (max_length_questions, length_feature_space) sequence that goes\n",
    "    through a stack of LSTM layers. Both branches are concatenated and fed to\n",
    "    a stack of Dense + Dropout layers followed by a 1000-way softmax.\n",
    "\n",
    "    number_of_LSTM              -- number of stacked LSTM layers (at least 1),\n",
    "                                   named lstm_1, lstm_2, ...\n",
    "    number_of_hidden_units_LSTM -- units of every LSTM layer\n",
    "    number_of_dense_layers      -- number of Dense + Dropout pairs\n",
    "    number_of_hidden_units      -- units of every hidden Dense layer\n",
    "    activation_function         -- activation of every hidden Dense layer\n",
    "    dropout_pct                 -- rate given to every Dropout layer\n",
    "\n",
    "    Returns an uncompiled Model with inputs [input_image, input_language].\n",
    "    Raises ValueError when number_of_LSTM is smaller than 1.\n",
    "    '''\n",
    "    if number_of_LSTM < 1:\n",
    "        raise ValueError('number_of_LSTM must be at least 1')\n",
    "\n",
    "    # input image: a flat vector of VGG features (the Reshape keeps the shape)\n",
    "    input_image = Input(shape=(length_vgg_features,),\n",
    "                        name='input_image')\n",
    "    model_image = Reshape((length_vgg_features,),\n",
    "                          input_shape=(length_vgg_features,))(input_image)\n",
    "\n",
    "    # input language: one embedding vector per time step\n",
    "    input_language = Input(shape=(max_length_questions, length_feature_space,),\n",
    "                           name='input_language')\n",
    "\n",
    "    # build a sequence of LSTM; only the last one collapses the sequence\n",
    "    # into a single vector, the others pass the full sequence on\n",
    "    model_language = input_language\n",
    "    for i in range(1, number_of_LSTM + 1):\n",
    "        model_language = LSTM(number_of_hidden_units_LSTM,\n",
    "                              return_sequences=(i < number_of_LSTM),\n",
    "                              name='lstm_%d' % i)(model_language)\n",
    "\n",
    "    # concatenate image vector and question vector\n",
    "    model = concatenate([model_image, model_language])\n",
    "\n",
    "    # Dense, Dropout. The activation is passed to the Dense layer: without\n",
    "    # it activation_function was ignored and the hidden layers were linear.\n",
    "    for _ in range(number_of_dense_layers):\n",
    "        model = Dense(number_of_hidden_units,\n",
    "                      kernel_initializer='uniform',\n",
    "                      activation=activation_function)(model)\n",
    "        model = Dropout(dropout_pct)(model)\n",
    "\n",
    "    model = Dense(1000,\n",
    "                  activation='softmax')(model)\n",
    "\n",
    "    # create model from tensors\n",
    "    return Model(inputs=[input_image, input_language], outputs=model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "____________________________________________________________________________________________________\n",
      "Layer (type)                     Output Shape          Param #     Connected to                     \n",
      "====================================================================================================\n",
      "input_language (InputLayer)      (None, 30, 300)       0                                            \n",
      "____________________________________________________________________________________________________\n",
      "lstm_1 (LSTM)                    (None, 30, 512)       1665024     input_language[0][0]             \n",
      "____________________________________________________________________________________________________\n",
      "input_image (InputLayer)         (None, 4096)          0                                            \n",
      "____________________________________________________________________________________________________\n",
      "lstm_2 (LSTM)                    (None, 30, 512)       2099200     lstm_1[0][0]                     \n",
      "____________________________________________________________________________________________________\n",
      "reshape_3 (Reshape)              (None, 4096)          0           input_image[0][0]                \n",
      "____________________________________________________________________________________________________\n",
      "lstm_3 (LSTM)                    (None, 512)           2099200     lstm_2[0][0]                     \n",
      "____________________________________________________________________________________________________\n",
      "concatenate_3 (Concatenate)      (None, 4608)          0           reshape_3[0][0]                  \n",
      "                                                                   lstm_3[0][0]                     \n",
      "____________________________________________________________________________________________________\n",
      "dense_8 (Dense)                  (None, 1024)          4719616     concatenate_3[0][0]              \n",
      "____________________________________________________________________________________________________\n",
      "dropout_7 (Dropout)              (None, 1024)          0           dense_8[0][0]                    \n",
      "____________________________________________________________________________________________________\n",
      "dense_9 (Dense)                  (None, 1024)          1049600     dropout_7[0][0]                  \n",
      "____________________________________________________________________________________________________\n",
      "dropout_8 (Dropout)              (None, 1024)          0           dense_9[0][0]                    \n",
      "____________________________________________________________________________________________________\n",
      "dense_10 (Dense)                 (None, 1024)          1049600     dropout_8[0][0]                  \n",
      "____________________________________________________________________________________________________\n",
      "dropout_9 (Dropout)              (None, 1024)          0           dense_10[0][0]                   \n",
      "____________________________________________________________________________________________________\n",
      "dense_11 (Dense)                 (None, 1000)          1025000     dropout_9[0][0]                  \n",
      "====================================================================================================\n",
      "Total params: 13,707,240\n",
      "Trainable params: 13,707,240\n",
      "Non-trainable params: 0\n",
      "____________________________________________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "# Build the combined network with its default arguments and print the\n",
    "# layer-by-layer summary\n",
    "combined_model = build_combined_model()\n",
    "combined_model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the weights stored in VQA_weights_file into the combined network\n",
    "combined_model.load_weights(VQA_weights_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Compile with the categorical cross-entropy loss and the rmsprop optimizer\n",
    "combined_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Run the network on the image features and the question features;\n",
    "# the inputs are given in the order [image, language]\n",
    "y_output = combined_model.predict([image_features, language_features])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100.0 %  no\n",
      "000.0 %  zoo\n",
      "000.0 %  empty\n",
      "000.0 %  female\n",
      "000.0 %  fedora\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/gulli/miniconda2/envs/tensorflow/lib/python2.7/site-packages/ipykernel_launcher.py:5: DeprecationWarning: The file '/Users/gulli/Books/TF/code/git/tensorflowBook/Chapter5/FULL_labelencoder_trainval.pkl' has been generated with a joblib version less than 0.10. Please regenerate this pickle file.\n",
      "  \"\"\"\n"
     ]
    }
   ],
   "source": [
    "# The task is represented as a classification into the 1000 top answers.\n",
    "# This means some answers were not part of training and thus can never\n",
    "# show up in the result.\n",
    "# These 1000 answers are stored in the sklearn encoder loaded below.\n",
    "# NOTE: joblib.load unpickles the file, so only point it at a trusted file.\n",
    "labelencoder = joblib.load(label_encoder_file_name)\n",
    "# walk over the 5 highest scores, best one first\n",
    "for label in reversed(np.argsort(y_output)[0,-5:]):\n",
    "    # inverse_transform gets a one-element list: recent scikit-learn\n",
    "    # versions reject a scalar argument\n",
    "    answer = labelencoder.inverse_transform([label])[0]\n",
    "    percentage = str(round(y_output[0,label]*100,2)).zfill(5)\n",
    "    print('%s %%  %s' % (percentage, answer))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
